suppressMessages({
library("DESeq2")
library("edgeR")
library("limma")
library("RColorBrewer")
library("gplots")
library("ggplot2")
library("EnhancedVolcano")
library("factoextra")
library("devtools")
library("rstudioapi")
library("dplyr")
library("tibble")
library("tidyverse")
library("pheatmap")
library("biomaRt")
library("annotables")
library("org.Mm.eg.db")
library("biobroom")
library("clusterProfiler")
library("pathfindR")
})
current_path <- getActiveDocumentContext()$path
setwd(dirname(current_path ))
# Read the Full count matrix (File has been provided for download)
counttable_original<-read.delim("FULL_count_matrix.txt", header=T, row.names=1)
# View the count matrix
#View(counttable_original)
# Gene symbol as the identifier (when compared to ENSG ID)
counttable<-counttable_original[,c("Symbol","WT1","WT2","WT3","KO1","KO2","KO3")]
row.names(counttable) <- NULL
# Convert Column'GeneSymbol' to rowname)
rownames(counttable) <- counttable$Symbol
counttable<-counttable[,c("WT1","WT2","WT3","KO1","KO2","KO3")]
#View(counttable)
boxplot(log2((counttable)+1),las=3, col="red")
design = ~1.# Define a condition variable
condition=c("Wild","Wild","Wild","KO","KO","KO")
meta <- data.frame(row.names=colnames(counttable),condition)
#View(meta)
dds <- DESeqDataSetFromMatrix(countData = counttable,
colData = meta,
design = ~1)
For visualization or clustering it might be useful to work with transformed versions of the count data.
The most obvious choice of transformation is the logarithm.
This transforms the raw count data (which is heteroskedatic - variance grows with the mean) into homoskedatic data (variance is not dependant on the mean).
Both methods produce data on the log2 scale, and normalize for other factors such as library size. Setting blind=TRUE (the default) should be used to compare samples in a manner wholly unbiased about the information about experimental groups, for example to perform sample QC.
Note : In order to test for differential expression, we operate on raw counts (and not normalized/transformed counts).
vst <- vst(dds, blind = TRUE)
vst.data <- assay(vst)
#
# Regularized log (rlog) takes a long time with 50 or more samples
# rld <- rlog(dds, blind=FALSE)
boxplot(vst.data,las=3, col="red")
sampleDists <- dist(t(assay(vst)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vst$condition, vst$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
plotPCA(vst)
pca=prcomp(t(assay(vst)),scale=FALSE)
options(repr.plot.width=0.5, repr.plot.height=0.5)
fviz_eig(pca, addlabels = TRUE)
#The function fviz_contrib() [in factoextra package] isused to visualize the contributions of rows/columns from the results of Principal Component Analysis (PCA)
fviz_contrib(pca, choice = "var", axes = 1, top = 100)
# You can see that the contribution to PCA is distributed over a large number of genes with varying proportions.
dds <- DESeqDataSetFromMatrix(countData = counttable,
colData = meta,
design = ~ condition)
colData = meta
In the above formula we have a : - A count matrix (countData) called counttable - A table of sample information called meta. - The design indicates how to model the samples, here, that we want to measure the effect of the condition - If we are controlling for batch differences then the design can be defined as design= ~ batch + condition. - The two factor variables batch and condition should be columns of coldata.
# How many gene rows before filtering?
nrow(dds)
## [1] 19859
keep <- rowSums(cpm(counttable)>1) >=4
dds <- dds[keep,]
# How many gene rows after filtering?
nrow(dds)
## [1] 13412
# Using factor
#dds$condition <- factor(dds$condition, levels = c("Wild","KO"))
#OR
# using relevel, just specifying the reference level:
dds$condition ~ relevel(dds$condition, ref="Wild")
## dds$condition ~ relevel(dds$condition, ref = "Wild")
dds <- DESeq(dds)
res <- results(dds)
# padj 0.05
res_padj0.05<-results(dds,alpha=0.05)
summary(res_padj0.05)
##
## out of 13412 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1092, 8.1%
## LFC < 0 (down) : 1216, 9.1%
## outliers [1] : 4, 0.03%
## low counts [2] : 0, 0%
## (mean count < 19)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
resSig005_subset<-subset(res_padj0.05, padj < 0.05)
write.table(resSig005_subset, "res_DeSeq2_FDR0.05_comparison_Wild_vs_KO_FUllMatrix.tab", sep="\t", col.names=NA, quote=F)
resSig005_subset <- data.frame(genes = row.names(resSig005_subset), resSig005_subset)
colnames(resSig005_subset)
## [1] "genes" "baseMean" "log2FoldChange" "lfcSE"
## [5] "stat" "pvalue" "padj"
# Writing normalized counts
normalised_counts<-counts(dds,normalized=TRUE)
write.table(normalised_counts, "normalised_all_samples_DeSeq2_FUllMatrix.tab", sep="\t", col.names=NA, quote=F)
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Gene baseMean log2FoldChange lfcSE stat pvalue padj
## 1 Zranb2 1539.04155 -0.4433732 0.2560683 -1.7314649 0.08336888 0.2168400
## 2 Miat 306.69384 0.4615508 0.3789418 1.2179991 0.22322430 0.4075817
## 3 Bcap31 3214.03476 -0.2571929 0.1763924 -1.4580728 0.14482048 0.3097883
## 4 Ctbs 122.19275 0.1418506 0.2097434 0.6763051 0.49884694 0.6744513
## 5 Maob 268.75441 0.1615179 0.1907347 0.8468196 0.39709569 0.5866952
## 6 Rgl3 31.67955 0.7941136 0.3918381 2.0266368 0.04269957 0.1407938
## WT1 WT2 WT3 KO1 KO2 KO3
## 1 1249.31302 1537.94879 1126.25485 2004.53215 2114.99447 1201.2060
## 2 280.84012 540.79507 244.37605 336.14083 283.75776 154.2532
## 3 2372.67350 3290.00418 3122.70094 3509.16255 3308.37570 3681.2917
## 4 133.61181 133.69097 116.87550 128.05365 115.90106 105.0235
## 5 286.79733 317.64171 247.56357 224.09389 274.96527 261.4647
## 6 46.80669 47.24418 26.56261 23.39442 26.37748 19.6919
## Volcano plot with "significant" genes labeled
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
#png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2),ylim=c(0, 50))
?volcanoplot
plotDispEsts(dds, ylim = c(1e-6,1e3) )
dds_orig <- DESeqDataSetFromMatrix(countData = counttable, colData = colData,design = ~condition)
# IGFBP5
p_site_Sp110<-plotCounts(dds_orig, gene="Sp110", intgroup = "condition", returnData = TRUE) %>%
ggplot() + aes(dds_orig$condition, count) + geom_boxplot(aes(fill=dds_orig$condition)) + geom_jitter(color="black", size=0.6, alpha=0.9) + scale_y_log10() + theme_bw()+ggtitle("Sp110")+ theme(legend.position = "none")
p_site_Sp110
p_site_Krt2<-plotCounts(dds, gene="Krt2", intgroup = "condition", returnData = TRUE) %>%
ggplot() + aes(dds$condition, count) + geom_boxplot(aes(fill=dds$condition)) + geom_jitter(color="black", size=0.6, alpha=0.9) + scale_y_log10() + theme_bw()+ggtitle("Krt2")+ theme(legend.position = "none")
p_site_Krt2
res_tidy.DE = tidy.DESeqResults(res)
res_tidy.DE <- res_tidy.DE %>% arrange(p.adjusted) %>% inner_join(grcm38, by = c(gene = "symbol")) %>% dplyr::select(gene,baseMean, estimate, stderror, statistic, p.value, p.adjusted)
#View(res_tidy.DE)
# Prepare input.
# Filter for significant up and down regulated genes by P adjust and log fold change.
# P adj < 0.05
sig <- res_tidy.DE[res_tidy.DE$p.adjusted < 0.05, ]
# Upregulated: LFC > 1, remove NAs
sig.up <- sig[sig$estimate > 1, ]
sig.up <- na.omit(sig.up)
sig.up.LFC <- sig.up$estimate
names(sig.up.LFC) <- sig.up$gene
# Sort by LFC, decreasing
sig.up.LFC <- sort(sig.up.LFC, decreasing = TRUE)
# Downregulated: LFC < 1, remove NAs
sig.dn <- sig[sig$estimate < 1, ]
sig.dn <- na.omit(sig.dn)
sig.dn.LFC <- sig.dn$estimate
names(sig.dn.LFC) <- sig.dn$gene
# Sort by LFC, decreasing
sig.dn.LFC <- sort(sig.dn.LFC, decreasing = TRUE)
ego.up <- enrichGO(gene = names(sig.up.LFC),
OrgDb = org.Mm.eg.db,
keyType = 'SYMBOL',
readable = FALSE,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2)
barplot(ego.up, showCategory=20)
dotplot(ego.up, showCategory=20,font.size = 10)
cnetplot(ego.up,
categorySize="pvalue",
foldChange=sig.up.LFC,
cex_label_gene = 1,
showCategory = 5,cex_label_category=1.2,shadowtext='category')
heatplot(ego.up)
ego.dn <- enrichGO(gene = names(sig.dn.LFC),
OrgDb = org.Mm.eg.db,
keyType = 'SYMBOL',
readable = FALSE,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2)
barplot(ego.dn, showCategory=20)
dotplot(ego.dn, showCategory=20,font.size = 10)
cnetplot(ego.dn,
categorySize="pvalue",
foldChange=sig.dn.LFC,
cex_label_gene = 0.7,
showCategory = 5,cex_label_category=1.5,shadowtext='category')
heatplot(ego.dn)